library(data.table)
library(ggplot2)
library(zoo)
library(Hmisc)
library(stringr)

## Helper functions for iBuyer ##
setwd("~/zfs/projects/faculty/buchak-ibuyer/code")

prepData <- function(Market = 'Phoenix') {
  
  print(paste('Prepping data for',Market))
  
  fields = c('pclidirisfrmtd','fipscode','transactiontype','propertytype','corporateindicator','owner1lastname','owner1firstnamemi','owneretalindicator','resalenewconstruction',
             'absenteeownerstatus','situscity','situszipcode','parcellevellatitude','parcellevellongitude','sellerlastname','sellername1','saleamount','saledate',
             'lendercompanycode','mortgageamount','mortgageloantypecode','lenderlastname','foreclosure','equityflag','refiflag',
             'censustract','landsquarefootage','universalbuilding_sqft','living_sqft','yearbuilt','airconditioning',
             'garage','heating','quality','roofcover','storiesnum','locationinfluence','electric','sewer','water')
  
  # iBuyers
  iBuyers = paste(c('opendoor','open door','\\<od [a-z].*',
                    'offerpad','offer pad',
                    'redfin','red fin',
                    'zillow',
                    'atlas',
                    'knock'),collapse = '|')
  
  # From Buchak et al (2018)
  fintechs = paste(c('quicken','amerisave','guaranteed','ntfn','internet',
                     '21st','first residential','movement','ne moves',
                     'nutter and company','cashcall','summit','cardinal','homeward'),collapse = '|')
  
  if(Market == 'Phoenix') {
    d.1 <- fread('../data/raw_deeds_data/MaricopaOpendoorBase1980_1989.csv')[,fields,with=F]
    d.2 <- fread('../data/raw_deeds_data/MaricopaOpendoorBase1990_1999.csv')[,fields,with=F]
    d.3 <- fread('../data/raw_deeds_data/MaricopaOpendoorBase2000_2009.csv')[,fields,with=F]
    d.4 <- fread('../data/raw_deeds_data/MaricopaOpendoorBase2010_2019.csv')[,fields,with=F]
    data <- rbind(d.1,d.2,d.3,d.4)
    rm(list=c('d.1','d.2','d.3','d.4'))
  } else if(Market == 'Las Vegas') {
    d.2 <- fread('../data/raw_deeds_data/32003OpendoorBase1990_1999.csv')[,fields,with=F]
    d.3 <- fread('../data/raw_deeds_data/32003OpendoorBase2000_2009.csv')[,fields,with=F]
    d.4 <- fread('../data/raw_deeds_data/32003OpendoorBase2010_2019.csv')[,fields,with=F]
    data <- rbind(d.2,d.3,d.4)
    rm(list=c('d.2','d.3','d.4'))
  } else if(Market == 'Orlando'){
    data <- fread('../data/raw_deeds_data/12095OpendoorBase.csv')[,fields,with=F]
  } else if(Market == 'Dallas') {
    data <- fread('../data/raw_deeds_data/48113OpendoorBase.csv')[,fields,with=F]
  } else if(Market == 'Charlotte') {
    data <- fread('../data/raw_deeds_data/37119OpendoorBase.csv')[,fields,with=F]
  } else if(Market == 'Raleigh') {
    data <- fread('../data/raw_deeds_data/37183OpendoorBase.csv')[,fields,with=F]
  } else if(Market == 'Gwinnett') {
    data <- fread('../data/raw_deeds_data/13135OpendoorBase.csv')[,fields,with=F]
  } else if(Market == 'Orange') {
    d.2 <- fread('../data/raw_deeds_data/6059OpendoorBase1990_1999.csv')[,fields,with=F]
    d.3 <- fread('../data/raw_deeds_data/6059OpendoorBase2000_2009.csv')[,fields,with=F]
    d.4 <- fread('../data/raw_deeds_data/6059OpendoorBase2010_2019.csv')[,fields,with=F]
    data <- rbind(d.2,d.3,d.4)
  } else if(Market == 'Los Angeles') {
    d.2 <- fread('../data/raw_deeds_data/6073OpendoorBase1990_1999.csv')[,fields,with=F]
    d.3 <- fread('../data/raw_deeds_data/6073OpendoorBase2000_2009.csv')[,fields,with=F]
    d.4 <- fread('../data/raw_deeds_data/6073OpendoorBase2010_2019.csv')[,fields,with=F]
    data <- rbind(d.2,d.3,d.4)
  } else if(Market == 'Houston') {
    d.2 <- fread('../data/raw_deeds_data/48201OpendoorBase1990_1999.csv')[,fields,with=F]
    d.3 <- fread('../data/raw_deeds_data/48201OpendoorBase2000_2009.csv')[,fields,with=F]
    d.4 <- fread('../data/raw_deeds_data/48201OpendoorBase2010_2019.csv')[,fields,with=F]
    data <- rbind(d.2,d.3,d.4)
  }
  
  ## Record the market
  data[,Market := Market]
  
  ## Filter out non-single family homes. If it's missing sometimes we can figure this out from othe recordings about the same property
  data[,propertytype := as.numeric(propertytype)]
  data[,single.family := max(as.numeric(propertytype==10),na.rm=T),by='pclidirisfrmtd']
  data[,condo         := max(as.numeric(propertytype==11),na.rm=T),by='pclidirisfrmtd']
  data <- data[single.family == 1 | condo == 1]

    
  ## Fix dates---only use the sale date. If no sale date it's not a sale...
  data[,date := saledate]
  data[date %% 100 == 0,date := NA]
  data[,date := as.Date(as.character(date),format = '%Y%m%d')]
  data$saledate <- NULL
  data[,year := year(date)]
  data <- data[!is.na(date)]
  
  ## Create Buyer / Seller Name
  data[,buyername := paste(owner1firstnamemi,owner1lastname)]
  data[,sellername := sellerlastname]
  data[sellername == '',sellername := sellername1]
  data[buyername == ' ',buyername := 'unknown']
  data[sellername == '',sellername := 'unknown']
  
  ## Only take arms' length transactions of existing homes or sales of new constructions.
  ## Primariliy filters out nominals & refis 
  data <- data[(transactiontype == 1 & resalenewconstruction == 'M') | (transactiontype == 3 & resalenewconstruction == 'N')]
  
  ## Create simple-id and remove duplicated transactions
  data[,id := paste(Market,pclidirisfrmtd)]
  data <- data[!duplicated(paste(date,id,buyername,sellername))]
  
  ## Identify iBuyers and fintech lenders
  data[,iBuyer.buyer  := as.integer(grepl(buyername,pattern=iBuyers,ignore.case=T))]
  data[,iBuyer.seller := as.integer(grepl(sellername,pattern=iBuyers,ignore.case=T))]
  data[,fintech.lender := as.integer(grepl(lenderlastname,pattern=fintechs,ignore.case=T))]
  
  ## Get lagged buyers / sellers
  data <- data[order(id,date)]
  data[,ltv := mortgageamount / saleamount]
  data[,seller.buy.date       := shift(date,type = 'lag'),by='id']
  data[,seller.buy.amount     := shift(saleamount,type= 'lag' ),by='id']
  data[,seller.fintech.lender := shift(fintech.lender,type= 'lag' ),by='id']
  data[,seller.ltv            := shift(ltv,type= 'lag' ),by='id']
  data[,seller.foreclosed     := shift(foreclosure,type= 'lag' ),by='id']
  data[,buyer.foreclosed      := shift(foreclosure,type= 'lead' ),by='id']
  
  # Fix names----going to take selerlastname and owner1lastname as defaults, and if they're missing, take the LAST STRING of buyer and seller name, unless it's like
  # 'llc' or something.
  
  # Corp names
  corpNames = c('CORP|LLC|INC|LP|SALES|CONSTRUCTION|TRUST|INC|unknown|HOMES|FNAM|DEV|JR|HUD|LTD|C|INS|MAE|ASSN|MARKETING|RECONVEYANCE|BK|FSB|CORPORATION|PROPERTIES|ADMN|ASSOCIATES|BANK|AMERICA|CORPS|DEVELOPMENT|AFFAIRS|INVESTMENTS|COUNTY|CORPORAT|HMS')
  
  data[,seller_lastname_clean := gsub(sellerlastname,pattern=corpNames,replace='')]
  data[,seller_lastname_clean := trimws(seller_lastname_clean)]
  data[seller_lastname_clean == '', seller_lastname_clean := trimws(word(trimws(gsub(sellername,pattern=corpNames,replace='')),-1))]
  data[seller_lastname_clean == '', seller_lastname_clean := NA]
  
  data[,buyer_lastname_clean := gsub(owner1lastname,pattern=corpNames,replace='')]
  data[,buyer_lastname_clean := trimws(buyer_lastname_clean)]
  data[buyer_lastname_clean == '', buyer_lastname_clean := trimws(word(trimws(gsub(buyername,pattern=corpNames,replace='')),-1))]
  data[buyer_lastname_clean == '', buyer_lastname_clean := NA]
  
  
  
  
  write.table(data,file=paste0('../data/processed_deeds_data/',gsub(Market,pattern=' ',replacement = '_'),'_processed.csv'),row.names=F,col.names=T)
  
}


processMarkets <- function() {
  markets = c('Phoenix','Orlando','Dallas','Las Vegas','Gwinnett')
  for(mm in markets) {
    prepData(mm)
  }
  combineData()
}

combineData <- function() {
  phoenix.in   <- fread('../data/processed_deeds_data/Phoenix_processed.csv')
  orlando.in   <- fread('../data/processed_deeds_data/Orlando_processed.csv')
  dallas.in    <- fread('../data/processed_deeds_data/Dallas_processed.csv')
  las.vegas.in    <- fread('../data/processed_deeds_data/Las_Vegas_processed.csv')
  gwinnett.in    <- fread('../data/processed_deeds_data/Gwinnett_processed.csv')
  
  data.in <- rbind(phoenix.in,orlando.in,dallas.in,las.vegas.in,gwinnett.in) 
  
  
  write.table(data.in,'../data/processed_deeds_data/Combined_processed.csv',row.names=F,col.names=T)
}

processMarkets()
